In [1]:
# Bootstrap local PySpark: put Spark's Python bindings on sys.path, then run
# pyspark/shell.py, which creates the `sc` SparkContext used by later cells.
import os
import sys
import glob

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))

# Fix: locate the bundled py4j zip instead of hard-coding one version string;
# fall back to the original 0.8.2.1 path if nothing matches.
py4j_zips = glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip'))
if py4j_zips:
    sys.path.insert(0, py4j_zips[0])
else:
    sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))

print(sys.path)  # sanity check: the Spark paths should appear first

# Python 2 execfile; shell.py defines `sc` and prints the Spark banner.
execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))
In [2]:
# Train a Word2Vec model on the corpus; each line is split on single spaces
# into a token list, which is the input format Word2Vec.fit expects.
from pyspark.mllib.feature import Word2Vec

inp = sc.textFile('./data/new_parsed_no_spam.txt') \
        .map(lambda row: row.split(" "))
model = Word2Vec().fit(inp)
In [33]:
# Persist the per-cookie embedding dict (built in the cell below).
# NOTE(review): this cell's execution count (33) precedes the cell that builds
# `user_tags` (72) — on Restart & Run All it must run AFTER that cell.
# Fixes: pickle data is binary, so open in 'wb' (was 'w'), close the handle
# deterministically (was leaked), and import pickle here so the cell does not
# depend on a later cell's imports.
import pickle

with open("./data/user_tags.pkl", 'wb') as fh:
    pickle.dump(user_tags, fh)
In [72]:
# Build a 100-dim embedding per cookie by averaging the Word2Vec vectors of
# its tags. Tags longer than 4 characters are segmented with jieba first;
# stop words and out-of-vocabulary tokens (model.transform raises ValueError)
# are skipped. Cookies with no in-vocabulary tag keep the zero vector.
import numpy as np
import pickle
import jieba, util
import csv, json
# Fix: Vectors canonically lives in pyspark.mllib.linalg; importing it from
# pyspark.mllib.feature only worked through that module's internal imports.
from pyspark.mllib.linalg import Vectors

jieba.load_userdict("./new.dict_all")
stop_words = util.load_stop_words('stopword.txt')  # fix: was loaded twice

# Fix: user_tags was never initialised anywhere visible — a fresh kernel
# would raise NameError on the first assignment below.
user_tags = {}

# Fix: use a context manager so the file handle is closed (was leaked).
with open("./data/cookies_tags.csv.1") as fh:
    for line in fh:
        # Fix: strip the trailing newline; previously it stuck to the last
        # tag of every line, making it miss the vocabulary.
        # Assumes exactly one comma per line (cookie,tags) — TODO confirm.
        cookie, tags = line.rstrip('\n').split(',')
        acc = Vectors.dense(np.zeros(100))
        count = 0
        for tag in tags.split(':'):
            if len(tag) > 4:
                # Long tag: segment, then accumulate each token's vector.
                # (Renamed the iterator — the original shadowed `tags`.)
                for token in jieba.cut(tag, cut_all=False):
                    if token in stop_words:
                        continue
                    try:
                        acc = acc + model.transform(token)
                        count = count + 1
                    except ValueError:
                        pass  # token not in the Word2Vec vocabulary
            else:
                try:
                    acc = acc + model.transform(tag)
                    count = count + 1
                except ValueError:
                    pass  # tag not in the vocabulary
        if count == 0:
            user_tags[cookie] = acc          # zero vector: nothing matched
        else:
            user_tags[cookie] = acc / count  # mean of matched vectors
In [78]:
# Dump every cookie's embedding as one space-separated row (vector components
# only; the cookie id itself is written by the later ".cluster" export).
import csv

with open('./data/cookies_tags.csv.1.vec', 'w') as csvfile:
    vec_writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for vec in user_tags.values():
        vec_writer.writerow(vec)
In [65]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load the per-cookie vectors written above and parse each line into a
# numpy float array (the rows were written space-delimited, unquoted).
data = sc.textFile("./data/cookies_tags.csv.1.vec")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Cluster the embeddings into 10 groups (random init, best of 100 runs).
clusters = KMeans.train(parsedData, 10, maxIterations=1000,
                        runs=100, initializationMode="random")

def error(point):
    """Squared Euclidean distance from `point` to its assigned center.

    Fix: the original returned sqrt(...) — the plain distance — so the value
    printed as "Within Set Sum of Squared Error" was actually a sum of
    distances, not of squared errors.
    """
    center = clusters.centers[clusters.predict(point)]
    return sum([x ** 2 for x in (point - center)])

# WSSSE: sum of squared distances of every point to its cluster center.
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# Save and load model
In [66]:
c1 = clusters.clusterCenters[0]
count = 0
for c1 in clusters.clusterCenters :
print "cluster: " + str(count)
count = count + 1
synonyms = model.findSynonyms(c1, 10)
for word, cosine_distance in synonyms:
print("{}: {}".format(word.encode('utf-8'), cosine_distance))
In [79]:
import csv
with open('./data/cookies_tags.csv.1.cluster', 'w') as csvfile:
writer = csv.writer(csvfile, delimiter=' ',
quotechar='|', quoting=csv.QUOTE_MINIMAL)
for cookie in user_tags :
vec = user_tags[cookie]
label = clusters.predict(vec)
l = list(vec)
l.insert(0,label)
l.insert(0,cookie)
writer.writerow(l)
In [ ]: